// This program maps raw values to clean, labeled integers using an external crosswalk.

**************************************************************************************
**************************************************************************************
**************************************************************************************

capture program drop encodefrom

program define encodefrom, nclass
	syntax varname using/, 	filetype(string) raw(string) clean(string) label(string) ///
		[delimiters(string)] [sheet(string)]  ///
		[label_name(string)] ///
		[noallow_missing] [CASEignore]

	version 8.0

	// declare temporary variables
	tempvar merge code labels N
	
	// declare error codes
	local syntaxError 198
	
	// display process comment
	display ""
	display "encoding `varlist' from `using'... "
	display ""
	
	// determine if varlist is string or number
	cap confirm numeric variable `varlist' 
	local type_string_raw = _rc
	
	// preserve the existing data 
	preserve
	
	**************************************************************************************
	
	*** DEFINE MAPPING FROM RAW TO CLEAN VALUES ***
	
	// sheet can be specified if and only if using excel
	// delimiters can be specified if and only if using delimited file
	if ("`sheet'" != "") & ("`filetype'" != "excel") {
		display as error "Cannot specify sheet unless using excel filetype"
		exit `syntaxError'
	} 
	if ("`delimiters'" != "") & ("`filetype'" != "delimited") {
		display as error "Cannot specify delimiters unless using delimited filetype"
		exit `syntaxError'
	}
	
	// get code values matched to raw values 
	if "`filetype'" == "excel" {
		qui import excel `"`using'"', sheet(`sheet') firstrow clear
	}
	else if "`filetype'" == "delimited" {
		qui insheet using `"`using'"', delimiter("`delimiters'") names case clear
	}
	else if "`filetype'" == "stata" {
		qui use `"`using'"', clear
	}
	else {
		display as error "`filetype' is not a valid filetype"
		exit `syntaxError'
	}
	
	qui keep `clean' `raw' `label'
	
		// allow for raw and (label or clean) to be same spreadsheet column
		foreach v in label clean { 
			if "``v''" == "`raw'" {
				tempvar `v'
				qui gen ``v'' = `raw'
			}
		}

	// if no label name specified, default to variable name
	if "`label_name'" == "" {
		local label_name `varlist'
	}

	//  determine if potential values are string or numeric
	cap confirm numeric variable `raw'
	local type_string_pot = _rc

	// make `raw' string or not depending on `varlist' format, and reduce to 
	// unique combinations of raw, clean, and label values 
	if `type_string_raw' | `type_string_pot'  {
		qui tostring `raw', replace usedisplayformat 
		qui drop if `raw' == "."
		if "`caseignore'" == "caseignore" {
			qui replace `raw' = lower(`raw')
		}
	}

	// rename variables to tempvars
	qui gen `code' = `clean'
	qui gen `labels' = `label'
	drop `clean' `label'

	// save matched codes data set
	if "`raw'" != "`varlist'" {
		rename  `raw' `varlist'	
	}
	
	**************************************************************************************
	
	*** DEFINE MAPPING FROM CLEAN VALUES TO LABELS ***
	
	// define label: this is a PITA (plug in the answer) method to store a local for each value label
	qui levelsof `code', local(codes_clean)
	foreach x of local codes_clean {
		forvalues i = 1/`=_N'{
			if `code'[`i'] == `x' {
				local label_`x' = `labels'[`i']
				break
			}
		}
	}

	// save codes mapping
	qui drop if missing(`varlist')
	qui duplicates drop
	tempfile codes
	qui save `codes'

	// verify that only one label is supplied for each clean code value	
	qui bysort `code' `labels': keep if (_n == 1)
	qui bysort `code': gen `N' = _N	
	qui cap assert (`N' == 1)
	if _rc { 
		display as error "The following code values are assigned to more than one label"
		list `code' `labels' if (`N' > 1)
		exit _rc
	}
		
	**************************************************************************************
	
	*** ENCODE AND LABEL VARIABLES ***
	
	// restore master data
	restore

	// make raw values string if potential values are strings
	if `type_string_raw' | `type_string_pot' {
		qui tostring `varlist', replace usedisplayformat
		if "`caseignore'" == "caseignore" {
			qui replace `varlist' = lower(`varlist')
		}
	}

	// merge with clean values
	qui merge m:1 `varlist' using `codes', keep(master match) keepusing(`code') gen(`merge')

	// verify that all raw values could be found in the provided spreadsheet
	qui cap assert (`merge' != 1 | missing(`varlist'))
	if _rc {
		display as error "The following values for `varlist' were not found in the supporting spreadsheet:"
		tab `varlist' if (`merge' == 1)
		exit _rc
	}	
		
	// verify that all raw values have found a non-missing clean value unless allow_missing is specified
	qui cap assert !missing(`code') if !missing(`varlist')
	if _rc & "`allow_missing'" == "noallow_missing" {
		display as error "The following raw values for `varlist' must be mapped to non-missing clean values:" 
		tab `varlist' if missing(`code')
		exit _rc
	}

	// replace raw values with clean values
	local lbl: var label `varlist'
	drop `varlist'
	qui gen `varlist' = `code'
	label var `varlist' "`lbl'"

	// label values
	cap label drop `label_name'
	local i = 1
	foreach x of local codes_clean {
		label define `label_name' `x' "`label_`x''", modify
	}	
	label values `varlist' `label_name'
	drop `merge' `code' 
	
	// tab new codes
	qui compress `varlist'
	tab `varlist', missing
	di ""

end	

**************************************************************************************
**************************************************************************************
**************************************************************************************

